Introduction

This analysis attempts to classify the distribution family of a dataset from features of its empirical density.

Methods

Parameters and Packages

library(ggplot2)
library(plotly)
library(parameters)
library(bayestestR)
library(report)
library(caret)
library(parallel)
library(doParallel)

# Fix the RNG seed so the simulations and train/test splits below are reproducible.
set.seed(333)

Distributions Description

# Simulate normalized 100-point density curves for 12 distribution families
# over a 5x5 grid of location/scale parameters (the meaning of the two
# parameters is family-specific). Curves are collected in a list and bound
# once at the end: rbind() inside the loop grows the data frame quadratically.
chunks <- list()
size <- 1000
for(location in round(seq(0.01, 10, length.out = 5), digits=2)){
  for(scale in round(seq(0.02, 10, length.out = 5), digits=2)){
    # Draw one sample per family, in the same order as the original code so
    # the RNG stream (seeded above with set.seed) is unchanged.
    samples <- list(
      "normal"      = rnorm(size, location, scale),
      "beta"        = rbeta(size, location, scale),
      # Binomial success probability must lie in (0, 1): shrink `scale` by
      # its order of magnitude (number of digits of round(scale)).
      "binomial"    = rbinom(size, round(location)+1, scale/10^(nchar(round(scale)))),
      "chi"         = rchisq(size, location, scale),
      "exponential" = rexp(size, scale),
      "F"           = rf(size, location, scale),
      "gamma"       = rgamma(size, location, scale),
      "lognormal"   = rlnorm(size, location, scale),
      "poisson"     = rpois(size, location),
      "t"           = rt(size, location, scale),
      "uniform"     = runif(size, location, location*2),
      "weibull"     = rweibull(size, location, scale)
    )
    # Identical post-processing for every family: 100-point density curve,
    # normalized to [0, 1], tagged with family and generating parameters.
    for(family in names(samples)){
      x <- parameters::normalize(as.data.frame(density(samples[[family]], n=100)))
      x$distribution <- family
      x$location <- location
      x$scale <- scale
      chunks[[length(chunks) + 1]] <- x
    }
  }
}
distributions <- do.call(rbind, chunks)

# Overlay the normalized density curves of every family, one panel per
# location/scale combination, then render the figure interactively.
density_plot <- ggplot(distributions, aes(x = x, y = y, colour = distribution)) +
  geom_line(size = 1) +
  facet_grid(location ~ scale) +
  theme_classic()
ggplotly(density_plot)

Results

Model Selection

Data Generation

generate_distribution <- function(family="normal", size=1000, noise=0, location=0, scale=1){
  # Draw `size` random values from the requested distribution family.
  #
  # Args:
  #   family:   one of "normal", "beta", "binomial", "chi", "exponential",
  #             "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull".
  #   size:     number of observations to draw.
  #   noise:    currently unused; kept in the signature for backward compatibility.
  #   location: first (location/shape) parameter; interpretation is family-specific.
  #   scale:    second (scale/shape) parameter; interpretation is family-specific.
  #
  # Returns: a numeric vector of length `size`.
  # Raises:  an informative error for an unknown `family` (the original
  #          if/else chain left `x` undefined and failed with
  #          "object 'x' not found").
  switch(family,
    "normal" = rnorm(size, location, scale),
    "beta" = rbeta(size, location, scale),
    # Success probability must lie in (0, 1): shrink `scale` by its order
    # of magnitude (number of digits of round(scale)).
    "binomial" = rbinom(size, round(location)+1, scale/10^(nchar(round(scale)))),
    "chi" = rchisq(size, location, scale),
    "exponential" = rexp(size, scale),
    # +0.1 keeps the second degrees-of-freedom parameter strictly positive.
    "F" = rf(size, location, scale+0.1),
    "gamma" = rgamma(size, location, scale),
    "lognormal" = rlnorm(size, location, scale),
    "poisson" = rpois(size, location),
    "t" = rt(size, location, scale),
    "uniform" = runif(size, location, location*2),
    "weibull" = rweibull(size, location, scale),
    stop("Unknown distribution family: ", family, call. = FALSE)
  )
}



# Feature-extraction training set: for each family, simulate 10 datasets with
# random size/location/scale and summarise each as one row of numeric
# features. Rows are collected in a list and bound once at the end (rbind
# inside the loop grows the data frame quadratically).
rows <- list()
for(distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")){
  for(i in 1:10){

    # Random generating parameters for this simulated dataset.
    size <- round(runif(1, 10, 10000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)

    x <- generate_distribution(distribution, size=size, location=location, scale=scale)

    # Normalized 20-point density curve: feeds the lagged smoothness
    # features and is kept verbatim as Density_1..Density_20 columns.
    density_Z <- parameters::normalize(density(x, n=20)$y)

    # Hoist statistics used by several features (identical values,
    # computed once instead of twice).
    x_mean <- mean(x)
    x_median <- median(x)
    x_sd <- sd(x)
    x_mad <- mad(x, constant=1)
    x_mode <- bayestestR::map_estimate(x)
    density_y <- density(x)$y

    # Extract features
    data <- data.frame(
      "Mean" = x_mean,
      "SD" = x_sd,
      "Median" = x_median,
      "MAD" = x_mad,
      "Mean_Median_Distance" = x_mean - x_median,
      "Mean_Mode_Distance" = x_mean - x_mode,
      "SD_MAD_Distance" = x_sd - x_mad,
      "Mode" = x_mode,
      "Range" = diff(range(x)) / x_sd,
      "IQR" = stats::IQR(x),
      "Skewness" = skewness(x),
      "Kurtosis" = kurtosis(x),
      "Smoothness_Cor" = parameters::smoothness(density_y, method="cor"),
      "Smoothness_Diff" = parameters::smoothness(density_y, method="diff"),
      "Smoothness_Z_Cor_1" = parameters::smoothness(density_Z, method="cor", lag=1),
      "Smoothness_Z_Diff_1" = parameters::smoothness(density_Z, method="diff", lag=1),
      "Smoothness_Z_Cor_3" = parameters::smoothness(density_Z, method="cor", lag=3),
      "Smoothness_Z_Diff_3" = parameters::smoothness(density_Z, method="diff", lag=3)
    )

    # Append the raw normalized density curve as extra predictors.
    density_df <- as.data.frame(t(density_Z))
    names(density_df) <- paste0("Density_", 1:ncol(density_df))
    data <- cbind(data, density_df)

    data$Distribution <- distribution
    rows[[length(rows) + 1]] <- data
  }
}
df <- do.call(rbind, rows)

Preparation

# Data cleaning: drop rows with missing values, then rows where any numeric
# feature is infinite (e.g. ratios computed from a zero SD).
df <- na.omit(df)
infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)]))
df <- df[!infinite, ]

# Data partitioning: stratified 80/20 train/test split on the outcome class.
trainIndex <- caret::createDataPartition(df$Distribution, p=0.8, list = FALSE)
train <- df[ trainIndex,]
test  <- df[-trainIndex,]


# Parameters shared by all models trained below.
fitControl <- caret::trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated ten times
                           repeats = 10,
                           classProbs = TRUE,
                           returnData = FALSE,  # keep fitted objects small
                           trim=TRUE,
                           allowParallel = TRUE)

# Set up parallel
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)

Training

# Training: fit four classifiers with the same resampling scheme so their
# cross-validated accuracies are directly comparable.
model_rf <- caret::train(Distribution ~ ., data = train, 
                 method = "rf", 
                 trControl = fitControl)

# Naive Bayes baseline.
model_nb <- caret::train(Distribution ~ ., data = train,
                 method = "naive_bayes",
                 trControl = fitControl)

# Gradient boosting machine.
model_gbm <- caret::train(Distribution ~ ., data = train, 
                 method = "gbm", 
                 trControl = fitControl)

# Single-hidden-layer neural network; linout=0 presumably selects the
# logistic (non-linear) output appropriate for classification — see ?nnet.
model_nnet <- caret::train(Distribution ~ ., data = train, 
                 method = "nnet", 
                 trControl = fitControl,
                 linout=0)

stopCluster(cluster) # explicitly shut down the cluster

Model Comparison

# Collect resamples (5 folds x 10 repeats = 50 resamples per model)
results <- resamples(list(
  "RandomForest" = model_rf,
  "NaiveBayes"= model_nb,
  "GBM"= model_gbm,
  "Nnet" = model_nnet
  ))

# Summarize the accuracy/kappa distributions across resamples
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: RandomForest, NaiveBayes, GBM, Nnet 
## Number of resamples: 50 
## 
## Accuracy 
##                    Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## RandomForest 0.65000000 0.7739899 0.8248082 0.8176960 0.8553571 0.9500000
## NaiveBayes   0.38888889 0.6190476 0.6842105 0.6883850 0.7619048 0.9000000
## GBM          0.65000000 0.7727273 0.8090909 0.8118162 0.8620130 0.9473684
## Nnet         0.09090909 0.1625387 0.2222222 0.2160156 0.2631579 0.3529412
##              NA's
## RandomForest    0
## NaiveBayes      0
## GBM             0
## Nnet            0
## 
## Kappa 
##                    Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## RandomForest 0.61748634 0.7523315 0.8076304 0.7997679 0.8420609 0.9452055
## NaiveBayes   0.33557047 0.5833731 0.6545423 0.6584340 0.7386431 0.8901099
## GBM          0.61643836 0.7512717 0.7910882 0.7933400 0.8493344 0.9420732
## Nnet         0.02173913 0.1019144 0.1669399 0.1574988 0.2130178 0.2889734
##              NA's
## RandomForest    0
## NaiveBayes      0
## GBM             0
## Nnet            0
# Dot plots of the resampled performance metrics
dotplot(results)

# Sizes of the fitted model objects in kB (object.size() reports bytes)
data.frame("RandomForest" = as.numeric(object.size(model_rf))/1000,
           "NaiveBayes" = as.numeric(object.size(model_nb))/1000,
           "GBM" = as.numeric(object.size(model_gbm))/1000,
           "Nnet" = as.numeric(object.size(model_nnet))/1000)
##   RandomForest NaiveBayes      GBM    Nnet
## 1     1153.152    445.848 3427.632 863.112

Best Model Inspection

# Keep the best-performing model (random forest) for detailed inspection.
model <- model_rf

# Performance on the held-out test set
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))
Performance
Accuracy 0.7916667
Kappa 0.7727273
AccuracyLower 0.5784872
AccuracyUpper 0.9286814
AccuracyNull 0.0833333
AccuracyPValue 0.0000000
McnemarPValue NaN
# Prediction Table: cross-tabulation of predicted vs. reference classes
knitr::kable(confusion$table)
beta binomial chi exponential F gamma lognormal normal poisson t uniform weibull
beta 1 0 0 0 0 0 0 0 0 0 0 0
binomial 0 1 0 0 0 0 0 0 0 0 0 0
chi 0 0 2 0 0 1 0 0 0 0 0 1
exponential 1 0 0 2 0 0 0 0 0 0 0 0
F 0 0 0 0 2 0 0 0 0 1 0 0
gamma 0 0 0 0 0 1 0 0 0 0 0 0
lognormal 0 0 0 0 0 0 2 0 0 0 0 0
normal 0 0 0 0 0 0 0 2 0 0 0 0
poisson 0 1 0 0 0 0 0 0 2 0 0 0
t 0 0 0 0 0 0 0 0 0 1 0 0
uniform 0 0 0 0 0 0 0 0 0 0 2 0
weibull 0 0 0 0 0 0 0 0 0 0 0 1
# Prediction Figure
perf <-  as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
# Row names are of the form "Class: <family>"; strip the prefix.
perf$Distribution <- gsub("Class: ", "", row.names(perf))
# Wide -> long: one row per (Distribution, metric), value stored in `Metric`.
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
# reshape() encodes the time variable as 1/2; map back to readable labels.
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")


# Grouped bar chart of sensitivity/specificity per family, 0.5 guide line.
ggplot(perf, aes(x=Distribution, y=Metric, fill=Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_hline(aes(yintercept=0.5), linetype="dotted") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Variable importance of the features
features <- caret::varImp(model, scale = TRUE)
plot(features)

Random Forest Training with Feature Selection

Data Generation

# Larger training set for the final model: 10,000 simulations per family,
# with a 100-point density and lag-1/lag-5 smoothness features (the raw
# density columns are dropped this time). Rows are collected in a list and
# bound once at the end (rbind inside the loop grows quadratically — costly
# for 120,000 iterations).
rows <- list()
for(distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")){
  for(i in 1:10000){

    # Random generating parameters for this simulated dataset.
    size <- round(runif(1, 10, 10000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)

    x <- generate_distribution(distribution, size=size, location=location, scale=scale)

    # Normalized 100-point density curve used for the smoothness features.
    density_Z <- parameters::normalize(density(x, n=100)$y)

    # Hoist statistics used by several features (identical values,
    # computed once instead of twice).
    x_mean <- mean(x)
    x_median <- median(x)
    x_sd <- sd(x)
    x_mad <- mad(x, constant=1)
    x_mode <- bayestestR::map_estimate(x)

    # Extract features
    data <- data.frame(
      "Mean" = x_mean,
      "SD" = x_sd,
      "Median" = x_median,
      "MAD" = x_mad,
      "Mean_Median_Distance" = x_mean - x_median,
      "Mean_Mode_Distance" = x_mean - x_mode,
      "SD_MAD_Distance" = x_sd - x_mad,
      "Mode" = x_mode,
      "Range" = diff(range(x)) / x_sd,
      "IQR" = stats::IQR(x),
      "Skewness" = skewness(x),
      "Kurtosis" = kurtosis(x),
      "Smoothness_Cor_1" = parameters::smoothness(density_Z, method="cor", lag=1),
      "Smoothness_Diff_1" = parameters::smoothness(density_Z, method="diff", lag=1),
      "Smoothness_Cor_5" = parameters::smoothness(density_Z, method="cor", lag=5),
      "Smoothness_Diff_5" = parameters::smoothness(density_Z, method="diff", lag=5)
    )

    data$Distribution <- distribution
    rows[[length(rows) + 1]] <- data
  }
}
df <- do.call(rbind, rows)

Preparation

# Data cleaning: drop rows with missing values, then rows where any numeric
# feature is infinite (e.g. ratios computed from a zero SD).
df <- na.omit(df)
infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)]))
df <- df[!infinite, ]

# Data partitioning: stratified 80/20 train/test split on the outcome class.
trainIndex <- caret::createDataPartition(df$Distribution, p=0.8, list = FALSE)
train <- df[ trainIndex,]
test  <- df[-trainIndex,]


# Parameters for the final training run.
fitControl <- caret::trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated ten times
                           repeats = 10,
                           classProbs = TRUE,
                           returnData = FALSE,  # keep fitted objects small
                           trim=TRUE,
                           allowParallel = TRUE)

# Set up parallel
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)

Training

# Training: random forest only — the best performer in the comparison above.
model <- caret::train(Distribution ~ ., data = train, 
                 method = "rf", 
                 trControl = fitControl)

stopCluster(cluster) # explicitly shut down the cluster

Model Inspection

# Performance of the final model on the held-out test set
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))
Performance
Accuracy 0.9768654
Kappa 0.9747622
AccuracyLower 0.9748834
AccuracyUpper 0.9787306
AccuracyNull 0.0833681
AccuracyPValue 0.0000000
McnemarPValue NaN
# Prediction Table: cross-tabulation of predicted vs. reference classes
knitr::kable(confusion$table)
beta binomial chi exponential F gamma lognormal normal poisson t uniform weibull
beta 1987 0 0 5 0 19 0 2 0 0 0 16
binomial 1 1955 0 0 1 0 0 0 49 0 1 1
chi 0 0 1977 3 0 40 1 1 0 3 0 49
exponential 4 0 5 1985 0 46 0 0 0 0 0 24
F 0 0 2 4 1985 1 6 0 0 3 0 15
gamma 4 1 11 2 2 1871 20 0 1 6 0 27
lognormal 0 0 0 0 7 2 1953 1 0 10 0 4
normal 0 1 0 0 0 0 0 1967 2 2 0 11
poisson 0 39 0 0 1 0 0 0 1942 2 0 3
t 0 0 1 0 3 4 9 0 2 1967 0 0
uniform 3 0 0 0 0 0 0 0 1 1 1998 1
weibull 1 3 4 1 0 17 11 29 2 0 1 1848
# Prediction Figure
perf <-  as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
# Row names are of the form "Class: <family>"; strip the prefix.
perf$Distribution <- gsub("Class: ", "", row.names(perf))
# Wide -> long: one row per (Distribution, metric), value stored in `Metric`.
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
# reshape() encodes the time variable as 1/2; map back to readable labels.
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")


# Grouped bar chart of sensitivity/specificity per family, 0.5 guide line.
ggplot(perf, aes(x=Distribution, y=Metric, fill=Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_hline(aes(yintercept=0.5), linetype="dotted") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Variable importance of the features
features <- caret::varImp(model, scale = TRUE)
plot(features)

Reduce Size and Save

# Reduce size: strip components that are not needed for prediction so the
# saved model stays small (per-observation fitted values, OOB bookkeeping,
# resampling indices, and the copy of the training data).
model$finalModel$predicted <- NULL
model$finalModel$oob.times <- NULL
model$finalModel$y <- NULL
model$finalModel$votes <- NULL
model$control$indexOut <- NULL
model$control$index    <- NULL
model$trainingData <- NULL

# Detach the captured environments, which can otherwise drag the whole
# workspace into the serialized file. Assigning NULL removes the attribute
# (the original used c(), which evaluates to NULL but is less explicit).
attr(model$terms, ".Environment") <- NULL
attr(model$formula, ".Environment") <- NULL

# Test: the trimmed model must still produce factor predictions.
is.factor(predict(model, df))
## [1] TRUE
# Save the trimmed model under the name it will be loaded as.
find_distribution <- model
save(find_distribution, file="find_distribution.rda")

References

Package Version References
bayestestR 0.1.0 Makowski, D. & Lüdecke, D. (2019). Understand and Describe Bayesian Models and Posterior Distributions using BayestestR. CRAN. Available from https://github.com/easystats/bayestestR. DOI: 10.5281/zenodo.2556486.
caret 6.0.81 Max Kuhn. Contributions from Jed Wing, Steve Weston, Andre Williams, Chris Keefer, Allan Engelhardt, Tony Cooper, Zachary Mayer, Brenton Kenkel, the R Core Team, Michael Benesty, Reynald Lescarbeau, Andrew Ziem, Luca Scrucca, Yuan Tang, Can Candan and Tyler Hunt. (2018). caret: Classification and Regression Training. R package version 6.0-81. https://CRAN.R-project.org/package=caret
doParallel 1.0.14 Microsoft Corporation and Steve Weston (2018). doParallel: Foreach Parallel Adaptor for the ‘parallel’ Package. R package version 1.0.14. https://CRAN.R-project.org/package=doParallel
foreach 1.4.4 Microsoft and Steve Weston (2017). foreach: Provides Foreach Looping Construct for R. R package version 1.4.4. https://CRAN.R-project.org/package=foreach
ggplot2 3.1.0 H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
iterators 1.0.10 Revolution Analytics and Steve Weston (2018). iterators: Provides Iterator Construct for R. R package version 1.0.10. https://CRAN.R-project.org/package=iterators
lattice 0.20.38 Sarkar, Deepayan (2008) Lattice: Multivariate Data Visualization with R. Springer, New York. ISBN 978-0-387-75968-5
parameters 0.1.0 Dominique Makowski and Daniel Lüdecke (2019). parameters: Processing of Model Parameters. R package version 0.1.0.
plotly 4.8.0 Carson Sievert (2018) plotly for R. https://plotly-book.cpsievert.me
report 0.1.0 Makowski, D. & Lüdecke, D. (2019). The report package for R: Ensuring the use of best practices for results reporting. CRAN. Available from https://github.com/easystats/report. doi: .